It has been well documented that live birth follows a seasonal pattern with more babies born in Spring/Summer in most countries of the northen Hemisphere.
We downloaded monthly live birth data from the UN [http://data.un.org/Data.aspx?d=POP&f=tableCode%3A55]
birth = read.csv(paste0(IO$birth_records_dir,"UNdata_Export_20180713_023958101_all.csv"), header = TRUE)
# removing the rows with the total number of birth - only keeping the monthly data
months = format(ISOdate(2004,1:12,1),"%B")
j = birth$Month %in% months
birth = birth[which(j),]
rm(j)
# proper date
birth$date = as.Date(paste0(birth$Year, "-",birth$Month,"-01"), format = c("%Y-%B-%d"))
range(birth$date)## [1] "1967-01-01" "2017-12-01"
# matching country names with countriesLow Names
birth$Country.or.Area = gsub("United States of America","United States", birth$Country.or.Area)
# getting geo-location
mat = match(birth$Country.or.Area, countriesLow$NAME)
birth$lat = countriesLow$LAT[mat]
birth$lon = countriesLow$LON[mat]
rm(mat)
# sorting countries by latitude
countries.levels = unique(birth$Country.or.Area)
m = match(countries.levels,countriesLow$NAME )
lat = countriesLow$LAT[m]
o = order(lat, decreasing = TRUE)
countries.levels = countries.levels[o]
birth$Country.or.Area = factor(birth$Country.or.Area, levels = countries.levels)
save(birth, file = paste0(IO$out_Rdata,"UN_birth_data.Rdata"))
rm(countries.levels, m, lat, o)# "Australia", "New Zealand","Bermuda", "Greenland", "Guadeloupe",
countries = c("Sweden","Denmark","Finland","Canada","Germany", "France", "Switzerland", "United States","Israel","Japan","Guatemala","American Samoa","New Caledonia","Seychelles","Chile")
# order countries by decreasing latitude
mat = match(countries, countriesLow$NAME)
lat = countriesLow$LAT[mat]
o = order(lat, decreasing = TRUE)
countries = countries[o]
rm(o, lat, mat)
# select countries. Selected countries had long term data and included those from Northern and Southern hemisphere
c = which(birth$Country.or.Area %in% countries)
g = ggplot(birth[c,], aes(x = date, y = Value, col = Country.or.Area))
g + geom_line() + ggtitle("Absolute number of live birth for selected countries")g = ggplot(birth[c,], aes(x = date, y = Value, col = Country.or.Area))
g + geom_line() + facet_grid( Country.or.Area ~ . , scales = "free_y") +
theme(strip.text.y = element_text(angle = 90)) +
ggtitle("Number of live birth for selected countries /!\ y-scale does not include 0")countries2 = countries[-which(countries %in% c("Guatemala","Switzerland","Finland"))]
cc = which((birth$Country.or.Area %in% countries2) & (birth$Year >= 2000))
g = ggplot(birth[cc,], aes(x = date, y = Value, col = Country.or.Area))
g + geom_line() + facet_grid( Country.or.Area ~ . , scales = "free_y") +
theme(strip.text.y = element_text(angle = 0,hjust = 0),
axis.text.y = element_blank(),
axis.ticks.y = element_blank())+ #expand_limits(y=25)+
guides(col = FALSE)+ ylab("Number of live births (min to max values)")+
ggtitle("Number of live birth for selected countries in the last 2 decades")long_ago = 1978:1981
cc = which((birth$Country.or.Area %in% c("Finland","France","United States", "Israel")) & (birth$Year %in% long_ago))
g = ggplot(birth[cc,], aes(x = date, y = Value, col = Country.or.Area))
g + geom_line() + facet_grid( Country.or.Area ~ . , scales = "free_y") +
theme(strip.text.y = element_text(angle = 90)) +
ggtitle("Number of live birth for selected countries in the late 70's")cc = which((birth$Country.or.Area %in% countries) & (birth$Year %in% long_ago))
g = ggplot(birth[cc,], aes(x = date, y = Value, col = Country.or.Area))
g + geom_line() + facet_grid( Country.or.Area ~ . , scales = "free_y") +
theme(strip.text.y = element_text(angle = 90)) +
ggtitle("Number of live birth for selected countries in the late 70's")recently = 2011:2013
cc = which((birth$Country.or.Area %in% c("Finland","France","United States", "Israel")) & (birth$Year %in% recently))
g = ggplot(birth[cc,], aes(x = date, y = Value, col = Country.or.Area))
g + geom_line() + facet_grid( Country.or.Area ~ . , scales = "free_y") +
theme(strip.text.y = element_text(angle = 90)) +
ggtitle("Number of live birth for selected countries in recent years")sh = which(birth$lat < 0)
g = ggplot(birth[sh,], aes(x = date, y = Value, col = Country.or.Area))
g + geom_line() + facet_grid( Country.or.Area ~ . , scales = "free_y") +
theme(strip.text.y = element_text(angle = 90)) +
ggtitle("Number of live birth for selected countries in the SOUTH hemisphere")rm(sh)Martinez-Bakker et al., 2014 have shown that the peak time of birth follows a pattern that depends on the latitude of the considered country.
To better understand this relationship and to later investigate the potential origin of birth seasonality, we do a rhythm analysis of the birth patterns to determine the phase (= the peak time) and the relative amplitude of the birth signals, taken year by year.
BR = data.frame()
for(country in unique(birth$Country.or.Area)){
c = which(birth$Country.or.Area == country)
b = birth[c,]
years = table(b$Year)
years = names(years)[years == 12]
for(y in years){
j = which(b$Year == y)
pft = periodic.fisher.test(t = 1:12, x = b$Value[j], p = 12)
line = data.frame(country = country, year = as.numeric(y), pval = pft$pval , phase = pft$phase, rel.ampl = pft$rel.amp)
BR = rbind(BR, line)
}
}
rm(line,c, b, j,y, country, pft, years)
hist(BR$pval, breaks = 50)# we expect p-value to be small if there is rhythmicity
BR$qval = p.adjust(BR$pval, method = "BH")# and plot the phase (in month), show the p-value and order by latitude.
m = match(BR$country, countriesLow$NAME)
BR$lat = countriesLow$LAT[m]
BR$lon = countriesLow$LON[m]
BR$significant_rhythm = BR$qval<=0.1
BR$month = BR$phase
BR$latitude = BR$lat
rm(m)
g = ggplot(BR, aes(x = phase, y= lat, col = year, alpha = (qval<=0.1)/2+0.3))
g + geom_point() + xlim(c(0,12)) + scale_x_continuous(breaks = 0:12) +
ggtitle("Peak month of birth by latitude and years")## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## Warning: Removed 658 rows containing missing values (geom_point).
g = ggplot(BR, aes(x = phase, y= lat, col = year, alpha = qval))
g + geom_point() + xlim(c(0,12)) + scale_x_continuous(breaks = 0:12) +
scale_alpha(range = c(0.1,1))## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## Warning: Removed 658 rows containing missing values (geom_point).
ggtitle("Peak month of birth by latitude and years")## $title
## [1] "Peak month of birth by latitude and years"
##
## attr(,"class")
## [1] "labels"
g = ggplot(BR, aes(x = month, y= latitude, col = year, alpha = significant_rhythm))
g + geom_point(size = 0.75) + #xlim(c(0,12)) +
geom_hline(yintercept = 0)+
scale_x_continuous(breaks = 0:12) +
scale_alpha_discrete(range = c(0.2,1))+
scale_color_gradient(low = "red4", high = "coral1")+
ggtitle("Peak month of birth by latitude and years")## Warning: Using alpha for a discrete variable is not advised.
## Warning: Removed 658 rows containing missing values (geom_point).
#mapw = map_data("world")
#g = ggplot()
#g + geom_polygon(data = mapw, aes(x=long, y = lat, group = group), fill= NA, col = "gray") +
# geom_point(data =BR, aes(x = lon + year-2000, y = lat, col = phase, alpha = (qval<=0.1)/2+0.3))
# non-transparent means significant. Phase is the month number
#rm(mapw)We indeed observe this latitude-dependency for the peak time of birth. Interestingly, for some countries in the Nothern hemisphere, we also see a shift towards later later peak time in recent years.
Having a closer look to some countries with measurements for many years:
c = which(BR$country %in% countries)
g = ggplot(BR[c,], aes(x = phase, y = year))
g + geom_point(aes(col = (qval <= 0.1))) +
facet_grid( country ~ .) +
theme_minimal() +
theme(strip.text.y = element_text(angle = 0))+
ggtitle("Shift in birth month since the 70's for many countries")# note that for some countries, the trend for the phase does change over the years
subset_countries = c("Sweden","Denmark","Finland","Canada","Germany", "France", "Switzerland", "United States","Israel","Japan")
g = ggplot(BR[BR$country %in% subset_countries, ], aes(x = phase, y = year, col = country))
g + geom_point(aes(alpha = (qval <= 0.1)/2+0.5)) +
geom_path(data = BR[(BR$country %in% subset_countries) & (BR$qval <= 0.1),], aes(x = phase, y = year, col = country)) +
xlim(c(0,12))+
theme_minimal() +
theme(strip.text.y = element_text(angle = 0))+
ggtitle("Except for the US and Israel, most countries have seen their peak month of birth being delayed")??? Maybe should first de-trend the data before computing the phase? But it does not seem to correlate with the overall trends…
We were then wondering if the amplitude of the oscillations were also dependent on latitude.
g = ggplot(BR, aes(x = rel.ampl , y = lat, col = year, alpha = (qval<=0.1)/2+0.3))
g + geom_point() + ggtitle("Relative amplitude of birth seasonality by latitude")## Warning: Removed 658 rows containing missing values (geom_point).
It does not seem to be the case.
In EU and US (which is the main population of the two apps) in recent years, the phases are quite similar (despite the relatively large variation in latitude), we can thus use Sympto and Kindara’s dataset to explore the origin of seasonality: i.e. is it a change in sexual behavior or a change in fertility?
countries.s = c("Germany", "Switzerland", "France","United States")
g = ggplot(BR[BR$country %in% countries.s,], aes(x = phase, y = year, col = country))
g + geom_point() + xlim(c(0,12)) + geom_hline(yintercept = 2010) +
ggtitle("Countries for which we have users in Sympto's dataset have similar peak month in the last decade")The peak birth month for these countries in recent years are July-August. The estimated time of conception is thus ~ November.
For Sympto, we have both location and user’s birth year (age). For Kindara, we currently do not have access to this information. Unfortunately, Sympto’s dataset is a lot smaller than Kindara’s dataset.
Clue, Kindara, Sympto
User table:
users = read_feather(path = paste0(IO$input_clue,"users.feather"))
head(users)## # A tibble: 6 x 7
## user_id birth_year_bin country_area height_bin weight_bin
## <chr> <chr> <chr> <chr> <chr>
## 1 73ce7b… 1995-1999 United Stat… 160-164 65-69
## 2 235eb9… 2000-2004 United Stat… 160-164 <NA>
## 3 34be79… 1995-1999 United Stat… 165-169 60-64
## 4 d21cf3… 2000-2004 United Stat… <155 50-54
## 5 0169d0… 1985-1989 United Stat… 160-164 55-59
## 6 caec40… 1975-1979 United Stat… 165-169 65-69
## # … with 2 more variables: latest_birth_control <chr>, csv_file <chr>
dim(users)## [1] 12000 7
cycles_00 = read_feather(path = paste0(IO$input_clue,"cycles/cycles0000.feather"))
head(cycles_00)## # A tibble: 6 x 11
## user_id cycle_nb cycle_start cycle_end cycle_length period_start
## <chr> <int> <date> <date> <int> <date>
## 1 77fab8… 3 2017-05-18 2017-06-15 29 2017-05-18
## 2 40ba99… 11 2018-07-04 2018-07-15 12 2018-07-04
## 3 bd760e… 12 2018-07-28 2018-08-19 23 2018-07-28
## 4 0125bd… 15 2018-12-16 2019-01-19 35 2018-12-16
## 5 a4bae9… 3 2018-07-17 2018-08-14 29 2018-07-17
## 6 f1c046… 27 2019-05-23 2019-06-18 27 2019-05-23
## # … with 5 more variables: period_end <date>, period_length <int>,
## # neg_preg_test <lgl>, pos_preg_test <lgl>, latest_preg_test <chr>
dim(cycles_00)## [1] 6679 11
tracking_folder = paste0(IO$input_clue,"tracking/")
files = list.files(tracking_folder)
tracking = read_feather(path = paste0(tracking_folder,files[1]))
head(tracking)## # A tibble: 6 x 5
## user_id date category type number
## <chr> <date> <chr> <chr> <chr>
## 1 8dab364e26c969e3f242d1c37158d4b9… 2018-09-11 pain tender_brea… <NA>
## 2 64460b1cbc44d72f23a5faed63741e65… 2017-10-19 period heavy <NA>
## 3 7c2bda3a9fd5a18377d397a2748bbb69… 2018-04-29 ANY <NA> <NA>
## 4 5a43343dc6524f888f90b3a408d9800e… 2018-09-15 ANY <NA> <NA>
## 5 65a8fc2d6659af080fe72cc9d9ea2960… 2017-08-18 ANY <NA> <NA>
## 6 2a2c3ee5b3f1567ad56282a907efe657… 2018-09-19 period light <NA>
unique(tracking$category)## [1] "pain" "period" "ANY" "digestion"
## [5] "sleep" "emotion" "energy" "exercise"
## [9] "mental" "pill_hbc" "sex" "fluid"
## [13] "social" "ailment" "iud" "poop"
## [17] "weight" "medication" "bbt" "injection_hbc"
Workflow:
Split Country and Area
Compute estimated mean BMI
Re-assemble the tracking tables into batches, making sure a single user has all its tracking in the same batch
Add the users features to the tracking table
Label user’s time-series
Birth control (as declared by users)
Birth control (re-assignment based on users’ declaration and on their tracking history - see Breast Tenderness paper method)
User’s tracking activity (convolution by X days from “ANY” tracking to reflect that the user was using the app)
countries_areas = users$country_area %>% str_split_fixed(., " - ", n = 2)
users$country = countries_areas[,1]
users$area = countries_areas[,2]
write_feather(users, path = paste0(IO$output_clue, "users.feather"))
file.copy(from = paste0(IO$output_clue, "users.feather"), to = paste0(IO$tmp_clue, "users_with_countries.feather"))## [1] FALSE
users$height_bin = factor(users$height_bin, levels = dict$height$bin)
users$weight_bin = factor(users$weight_bin, levels = dict$weight$bin)
heigh_mean = dict$height$mean[match(users$height_bin,dict$height$bin)]
weight_mean = dict$weight$mean[match(users$weight_bin,dict$weight$bin)]
users$est_mean_bmi = weight_mean/((heigh_mean/100)^2)
write_feather(users, path = paste0(IO$output_clue, "users.feather"))
file.copy(from = paste0(IO$output_clue, "users.feather"), to = paste0(IO$tmp_clue, "users_with_bmi.feather"))## [1] TRUE
ggplot(users, aes(x = est_mean_bmi, fill = country_area))+
geom_histogram(binwidth = 2)+
facet_grid(country_area ~ ., scale = "free_y")## Warning: Removed 2730 rows containing non-finite values (stat_bin).
max_batch_size = 5000
n_batch = max(par$min_n_batches, ceiling(nrow(users)/max_batch_size))
batch_size = ceiling(nrow(users)/n_batch)
users$batch = rep(1:n_batch, each = batch_size)[1:nrow(users)]
write_feather(users, path = paste0(IO$output_clue, "users.feather"))
file.copy(from = paste0(IO$output_clue, "users.feather"), to = paste0(IO$tmp_clue, "users_with_batches.feather"))## [1] TRUE
tracking_folder = paste0(IO$input_clue,"tracking/")
files = list.files(tracking_folder)
tmp_folder = paste0(IO$tmp_clue,"tracking_split_in_batches/")
if(!file.exists(tmp_folder)){dir.create(tmp_folder,recursive = TRUE)}else{file.remove(paste0(tmp_folder,list.files(tmp_folder)))}
cl = makeCluster(par$n_cores)
registerDoParallel(cl)
tic()
users_original_file_ids = foreach(file = files, .combine = rbind, .packages = c("feather","stringr")) %dopar%
{
tracking = read_feather(path = paste0(tracking_folder, file))
dim(tracking)
tracking$batch = users$batch[match(tracking$user_id, users$user_id)]
if(nrow(tracking)>0){
for(b in unique(tracking$batch)){
tracking_batch = tracking[which(tracking$batch == b),]
new_file_name = gsub(".feather",paste0("_batch_",b,".feather"),file)
write_feather(tracking_batch, path = paste0(tmp_folder, new_file_name ))
}
users_original_file_ids = data.frame(user_id = unique(tracking$user_id), original_file_id = str_extract(file,"\\d{4}"))
}else{ users_original_file_ids = data.frame(user_id = character(), original_file_id = character())}
return(users_original_file_ids)
}
toc()## 4.548 sec elapsed
stopImplicitCluster()
users_o_f = aggregate(original_file_id ~ user_id ,users_original_file_ids, function(x) paste0(sort(x), collapse = ","))
colnames(users_o_f) = c("user_id","original_tracking_files")
write_feather(users_o_f, path = paste0(IO$tmp_clue,"original_tracking_files_per_users.feather"))input_folder = paste0(IO$tmp_clue,"tracking_split_in_batches/")
output_folder = paste0(IO$output_clue,"tracking/")
if(file.exists(output_folder)){unlink(output_folder, recursive = TRUE)} ;dir.create(output_folder,recursive = TRUE)
tmp_folder = paste0(IO$tmp_clue,"tracking_in_batches/")
if(file.exists(tmp_folder)){unlink(tmp_folder, recursive = TRUE)} ;dir.create(tmp_folder,recursive = TRUE)
batches = unique(users$batch)
ok = foreach(b = batches) %do% {
cat(b,"\n")
all_files = list.files(input_folder)
files = all_files[grep(paste0("_batch_",b,"\\."),all_files)]
cl = makeCluster(par$n_cores)
registerDoParallel(cl)
tracking = foreach(file = files, .combine = rbind, .packages = "feather") %dopar%{tracking = read_feather(path = paste0(input_folder, file));return(tracking)}
stopImplicitCluster()
cols_to_add = c("birth_year_bin","country_area","height_bin","weight_bin", "est_mean_bmi")
m = match(tracking$user_id, users$user_id)
for(col_to_add in cols_to_add){
eval(parse(text = paste0("tracking$",col_to_add," = users$",col_to_add,"[m]")))
}
o = order(tracking$user_id, tracking$date, tracking$category, tracking$type, tracking$number)
tracking = tracking[o,]
new_file_name = paste0("tracking_",b,".feather")
write_feather(tracking, path = paste0(output_folder, new_file_name ))
file.copy(from = paste0(output_folder, new_file_name ), to = paste0(tmp_folder, new_file_name), overwrite = TRUE)
}## 1
## 2
## 3
## 4
## 5
tracking_folder = paste0(IO$output_clue,"tracking/")
files = list.files(tracking_folder)
cl = makeCluster(par$n_cores)
registerDoParallel(cl)
tic()
users_agg = foreach(file = files, .combine = rbind, .packages = c("feather","stringr", "plyr")) %dopar%
{
tracking = read_feather(path = paste0(tracking_folder, file))
users_agg = ddply(tracking,
.(user_id),
summarize,
first_obs = min(date),
last_obs = max(date),
n_obs_day = length(unique(date)),
n_obs = sum(category != "ANY"),
n_sex = sum(category == "sex"),
n_mucus = sum(category == "fluid"),
n_temp = sum(category == "bbt")
)
return(users_agg)
}
toc()## 11.46 sec elapsed
stopImplicitCluster()
write_feather(users_agg, path = paste0(IO$tmp_clue, "users_agg.feather"))
cols_to_add = setdiff(colnames(users_agg),"user_id")
m = match(users$user_id, users_agg$user_id)
for(col_to_add in cols_to_add){eval(parse(text = paste0("users$",col_to_add," = users_agg$",col_to_add,"[m]")))}
users$n_days = as.numeric(users$last_obs - users$first_obs)
write_feather(users, path = paste0(IO$output_clue, "users.feather"))
file.copy(from = paste0(IO$output_clue, "users.feather"), to = paste0(IO$tmp_clue, "users_augmented.feather"))## [1] TRUE
As declared by the users:
BC = read_feather(path = paste0(IO$input_clue,"birth_control.feather"))
o = order(BC$user_id, BC$date)
BC = BC[o,]
BC$birth_control = replace_na(BC$birth_control, "undefined")
write_feather(BC, path = paste0(IO$output_clue, "birth_control.feather"))BC = read_feather(path = paste0(IO$output_clue,"birth_control.feather"))
input_folder = paste0(IO$tmp_clue,"tracking_in_batches/")
output_folder = paste0(IO$output_clue,"tracking/")
tmp_folder = paste0(IO$tmp_clue, "tracking_with_user_defined_birth_control/")
if(!dir.exists(tmp_folder)){dir.create(tmp_folder)}
files = list.files(input_folder)
cl = makeCluster(par$n_cores)
registerDoParallel(cl)
tic()
catch = foreach(file = files, .combine = rbind, .packages = c("feather","stringr", "plyr","tidyverse")) %dopar%
{
tracking = read_feather(path = paste0(input_folder, file))
agg = aggregate(date ~ user_id, tracking, min)
min_date = agg$date[match(tracking$user_id, agg$user_id)]
tracking = tracking %>% mutate(rel_date = as.numeric(date - min_date + 1))
u_tracking = tracking %>% select(user_id, date) %>% unique() %>% mutate(birth_control = NA,pill_type = NA, pill_regiment = NA)
u_BC = BC %>% select(-csv_file) %>% filter(user_id %in% unique(tracking$user_id)) %>% unique()
BC_exp = rbind(u_tracking, u_BC) %>% arrange(., user_id, date)
# need to expand the birth_control variables to replace the NAs with the latest value in the file
BC_exp = BC_exp %>% group_by(user_id) %>% mutate(
birth_control = replace_NAs_with_latest_value(birth_control) %>% replace_na("undefined"),
pill_type = replace_NAs_with_latest_value(pill_type),
pill_regiment = replace_NAs_with_latest_value(pill_regiment)
)
tracking_key = str_c(tracking$user_id, "_",tracking$date)
BC_key = str_c(BC_exp$user_id,"_",BC_exp$date)
m = match(tracking_key,BC_key)
tracking$birth_control_ud = BC_exp$birth_control[m]
tracking$pill_type_ud = BC_exp$pill_type[m]
tracking$pill_regiment_ud = BC_exp$pill_regiment[m]
write_feather(tracking, path = paste0(output_folder,file))
file.copy(from = paste0(output_folder,file), to = paste0(tmp_folder,file), overwrite = TRUE)
}
toc()## 39.986 sec elapsed
stopImplicitCluster()active_tracking_filter = function(x, N = 28*1.5){
#res = pmin(1,stats::filter(x, filter = rep(1,N), sides = 1))
#res = pmin(1,stats::filter(x, filter = rep(1,N), method = "recursive"))
#res[is.na(res)] = 0
res = pmin(1,stats::filter(c(rep(0,N-1),x), filter = rep(1,N), sides = 1))
res = res[-which(is.na(res))]
return(res)
}input_folder = paste0(IO$output_clue,"tracking/")
files = list.files(input_folder)
output_folder = paste0(IO$tmp_clue, "active_tracking/")
if(!dir.exists(output_folder)){dir.create(output_folder)}
time_vec = seq(min(users$first_obs), max(users$last_obs), by = 1)
cl = makeCluster(par$n_cores)
registerDoParallel(cl)
tic()
ok = foreach(file = files, .packages = c("dplyr", "feather"), .export = "active_tracking_filter") %dopar% {
tracking = read_feather(path = paste0(input_folder, file))
any_tracking = tracking %>% filter(. , category == "ANY") %>% select(c(user_id, date, birth_control_ud))
any_tracking$tracking_days = 1
tmp = data.frame(user_id = rep(unique(tracking$user_id), each = length(time_vec)), date = rep(time_vec, lu(tracking$user_id)))
active_tracking = dplyr::full_join(tmp,any_tracking, by = c("user_id","date"))
active_tracking$tracking_days[is.na(active_tracking$tracking_days)] = 0
active_tracking$birth_control_ud = ave(active_tracking$birth_control_ud, by = active_tracking$user_id, FUN = replace_NAs_with_latest_value)
active_tracking = active_tracking %>% group_by(user_id) %>%
mutate(., tracking = active_tracking_filter(tracking_days))
active_tracking = active_tracking %>% select(user_id, date,birth_control_ud, tracking)
# compressed table
active_tracking = active_tracking %>% group_by(user_id, birth_control_ud) %>% mutate(transition = diff(c(0, tracking)))
active_tracking = active_tracking %>% group_by(user_id) %>% mutate(stretch_num = cumsum(transition == 1))
active_tracking = active_tracking %>% group_by(user_id, birth_control_ud, stretch_num, tracking) %>%
mutate(stretch_length = sum(tracking))
compressed_tracking = active_tracking %>% filter(transition == 1) %>% select(user_id, date,birth_control_ud, stretch_num, stretch_length) %>% rename(start_date = date)
file_name = paste0("active_",file)
write_feather(compressed_tracking, path = paste0(output_folder,file_name))
}## Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
## exporting variable(s): active_tracking_filter
toc()## 22.868 sec elapsed
stopImplicitCluster()users = read_feather(path = paste0(IO$output_clue, "users.feather"))time_vec = seq(min(users$first_obs), max(users$last_obs), by = 1)input_folder = paste0(IO$output_clue,"tracking/")
files = list.files(input_folder)
input_active_tracking = paste0(IO$tmp_clue,"active_tracking/")
tracking_pop_agg = foreach(file = files, .combine = rbind, .packages = c("dplyr","tidyverse")) %do% {
# tracking
tracking = read_feather(path = paste0(input_folder, file))
tracking$BC = dict$BC$type[match(tracking$birth_control_ud, dict$BC$birth_control)]
tracking = filter(tracking, BC %in% c("F","I"))
# variables aggregates
tracking_pop_agg = ddply(tracking,
.(date,country_area,BC),
summarise,
n_prot_sex = sum((category == "sex") & (type == "protected_sex"), na.rm = TRUE),
n_unprot_sex = sum((category == "sex") & (type == "unprotected_sex"), na.rm = TRUE),
n_wd_sex = sum((category == "sex") & (type == "withdrawal_sex"), na.rm = TRUE)
)
tracking_pop_agg = tracking_pop_agg %>% mutate(n_sex = n_prot_sex + n_unprot_sex + n_wd_sex)
# active tracking
active_tracking_compressed = read_feather(path = paste0(input_active_tracking,"active_",file))
active_tracking = expand_compressed_tracking(active_tracking_compressed)
active_tracking$BC = dict$BC$type[match(active_tracking$birth_control_ud, dict$BC$birth_control)]
active_tracking = filter(active_tracking, BC %in% c("F","I"))
# total number of users
active_tracking$country_area = tracking$country_area[match(active_tracking$user_id, tracking$user_id)]
active_tracking_agg = ddply(active_tracking,
.(date,country_area,BC),
summarise,
n_users = sum(tracking, na.rm = TRUE)
)
tmp = dplyr::full_join(x = active_tracking_agg , y = tracking_pop_agg, by = c("date","country_area","BC")) %>%
arrange(country_area, BC, date) %>%
replace_na(list(n_prot_sex = 0,n_unprot_sex = 0, n_wd_sex= 0, n_sex = 0))
return(tmp)
}
# sum the results from all files
tmp = tracking_pop_agg %>% group_by(date, country_area, BC) %>%
summarise_each(.,sum) %>% arrange(country_area, BC, date)
# expand to have one row per possible date
tmp2 = expand.grid(date = time_vec, country_area = unique(tmp$country_area), BC = unique(tmp$BC))
tmp3 = dplyr::full_join(tmp, tmp2, by = c("date","country_area","BC")) %>%
arrange(country_area, BC, date) %>%
replace_na(., list(n_users = 0,n_prot_sex = 0,n_unprot_sex = 0, n_wd_sex= 0, n_sex = 0))## Warning: Column `country_area` joining character vector and factor,
## coercing into character vector
## Warning: Column `BC` joining character vector and factor, coercing into
## character vector
# final table
tracking_pop_agg = tmp3
# save the results
indicators_folder = paste0(IO$output_clue, "pop_indicators/")
if(!dir.exists(indicators_folder)){dir.create(indicators_folder)}
write_feather(tracking_pop_agg, path = paste0(indicators_folder, "sex_pop_indicators.feather"))ggplot(tracking_pop_agg, aes(x = date, y = n_users, col = BC))+
geom_line()+
facet_wrap(country_area ~ .)ggplot(tracking_pop_agg, aes(x = date, y = n_sex/n_users, col = BC))+
geom_line()+
facet_wrap(country_area ~ .)ggplot(tracking_pop_agg %>% filter(date > as.Date("2017-06-30")), aes(x = date, y = n_sex/n_users, col = BC))+
geom_line()+
facet_grid(country_area ~ BC)ggplot(tracking_pop_agg %>% filter(date > as.Date("2017-06-30")), aes(x = date, y = n_prot_sex/n_users, col = BC))+
geom_line()+
facet_grid(country_area ~ BC)ggplot(tracking_pop_agg %>% filter(date > as.Date("2017-06-30")), aes(x = date, y = n_unprot_sex/n_users, col = BC))+
geom_line()+
facet_grid(country_area ~ BC)tracking_pop_agg = tracking_pop_agg %>% mutate(weekday = wday(date, week_start = 1),
month = month(date),
date_month = year(date)+(month-1)/12)
ggplot(tracking_pop_agg %>% filter(date > as.Date("2017-06-30")),
aes(x = factor(weekday), y = n_sex/n_users, col = BC))+
geom_violin(draw_quantiles = 0.5)+
facet_grid(country_area ~ BC)## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to
## unique 'x' values
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to
## unique 'x' values
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to
## unique 'x' values
ggplot(tracking_pop_agg %>% filter(date > as.Date("2017-06-30")),
aes(x = factor(month), y = n_sex/n_users, col = BC))+
geom_violin(draw_quantiles = 0.5)+
facet_grid(country_area ~ BC)ggplot(tracking_pop_agg %>% filter(date > as.Date("2017-06-30")),
aes(x = factor(date_month), y = n_sex/n_users, col = BC))+
geom_violin(draw_quantiles = 0.5)+
facet_grid(country_area ~ BC)ggplot(tracking_pop_agg %>% filter(date > as.Date("2017-06-30")),
aes(x = date_month, y = n_sex/n_users, col = BC, fill = BC))+
stat_summary(geom="ribbon",
fun.ymin = function(x) quantile(x, 0.05),
fun.ymax = function(x) quantile(x, 0.95),
alpha = 0.3, col = NA) +
stat_summary(geom="ribbon",
fun.ymin = function(x) quantile(x, 0.25),
fun.ymax = function(x) quantile(x, 0.75),
alpha = 0.3, col = NA) +
stat_summary(geom = "line", fun.y=median) +
facet_grid(country_area ~ BC, scale = "free_y")ggplot(tracking_pop_agg %>% filter(date > as.Date("2017-06-30")),
aes(x = date_month, y = n_sex/n_users, col = country_area))+
stat_summary(geom = "line", fun.y=median) +
facet_grid(. ~ BC)users = read_feather(path = paste0(IO$output_clue, "users.feather"))users$country_area = factor(users$country_area, levels = names(sort(table(users$country_area))))
ggplot(users, aes(x = country_area, fill = country_area)) + coord_flip() + geom_bar() + guides(fill = FALSE)ggplot(users, aes(x = birth_year_bin)) + geom_bar() + facet_grid(country_area ~ .) + guides(fill = FALSE)ggplot(users, aes(x = height_bin)) + geom_bar() + facet_grid(country_area ~ .) + guides(fill = FALSE)ggplot(users, aes(x = weight_bin)) + geom_bar() + facet_grid(country_area ~ .) + guides(fill = FALSE)ggplot(users, aes(x = est_mean_bmi)) + geom_histogram(binwidth = 1) + facet_grid(country_area ~ .) + guides(fill = FALSE)## Warning: Removed 2730 rows containing non-finite values (stat_bin).
users$latest_birth_control = factor(users$latest_birth_control, levels = names(sort(table(users$latest_birth_control))))
ggplot(users, aes(x = latest_birth_control, fill = latest_birth_control)) + coord_flip() + geom_bar() + guides(fill = FALSE)